#importing necessary packages
import sys
import warnings
if not sys.warnoptions:
    warnings.simplefilter("ignore")  # silence library warnings unless -W was passed
import sqlite3
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import pickle
# BUG FIX: sklearn.cross_validation was removed in scikit-learn 0.20; its
# contents moved to sklearn.model_selection (train_test_split is imported
# from there below), so the two old imports crashed on any modern install.
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score,recall_score,f1_score,confusion_matrix,roc_auc_score,roc_curve
After removing stopwords, punctuation, meaningless characters, and HTML tags from the Text column and applying stemming, we use the cleaned text directly, since this preprocessing was already done in the previous assignment.
# Load the pre-cleaned reviews from the SQLite database into a DataFrame.
conn = sqlite3.connect('cleanedTextData.sqlite')
data = pd.read_sql_query('''
SELECT * FROM Reviews
''', conn)
# Drop the stored row-index column carried over from the previous assignment.
data = data.drop(columns='index')
data.shape
data.columns
data['CleanedText'].head(3)
def wordCloud(clusterPerReview, reviewText, k):
    '''
    Print a word cloud for every cluster.

    Parameters:
        clusterPerReview : cluster label per review (parallel to reviewText)
        reviewText       : list of review strings
        k                : number of clusters, printed in the header
    '''
    # Concatenate the text of all reviews that share a cluster label.
    clusterGroup = {}
    for label, text in zip(clusterPerReview, reviewText):
        if label in clusterGroup:
            clusterGroup[label] += text
        else:
            clusterGroup[label] = text
    print('So we have', k, 'clusters here representing in wordcloud:')
    # BUG FIX: the original iterated the GLOBAL `cluster` variable here, so
    # the function silently depended on outer state; iterate the parameter.
    for label in sorted(set(clusterPerReview)):
        print('Cluster Number', label + 1, ':')
        plt.figure()
        plt.imshow(WordCloud().generate(clusterGroup[label]))
        plt.axis("off")
This data has a time attribute, so it is reasonable to use time-based splitting instead of random splitting.
Therefore, before splitting, we sort the data by time; here we take 50k points from our dataset (population).
# Parse the Time column and sort ascending so a time-based split is possible.
# NOTE(review): unit="ms" assumes epoch milliseconds; Amazon review
# timestamps are usually epoch seconds (unit="s") — confirm against the data.
data["Time"] = pd.to_datetime(data["Time"], unit = "ms")
data = data.sort_values(by = "Time")
# First 50k rows after an ASCENDING sort are the EARLIEST reviews,
# not the latest as the original comment claimed.
Xdata1= data[:50000]['CleanedText']
len(Xdata1)
from sklearn.cluster import KMeans
from wordcloud import WordCloud
import matplotlib.pyplot as plt
%matplotlib inline
def optimalKmeans(Xdata):
    '''
    Fit KMeans for K in 2..8, plot the inertia (elbow) curve, and return
    the K at the elbow of the curve.

    Parameters: Xdata — any feature matrix accepted by KMeans.fit.
    Returns: int, the chosen number of clusters.
    '''
    param_K = [2, 3, 4, 5, 6, 7, 8]
    inertia = {}
    for K in param_K:
        # NOTE(review): precompute_distances / n_jobs were removed from
        # KMeans in scikit-learn 1.0+ — confirm the installed version.
        model = KMeans(n_clusters=K, init='k-means++', precompute_distances=True, n_jobs=-1)
        model.fit(Xdata)
        inertia[K] = model.inertia_
    plt.plot(list(inertia.keys()), list(inertia.values()))
    plt.xlabel("No. of cluster")
    plt.ylabel("Inertia")
    plt.show()
    # BUG FIX: inertia decreases monotonically with K, so taking the minimum
    # always returned the largest K tried (8). Pick the elbow instead: the
    # interior K with the largest second difference (sharpest bend).
    ks = sorted(inertia)
    bestK = ks[0]
    best_bend = float('-inf')
    for i in range(1, len(ks) - 1):
        bend = inertia[ks[i - 1]] - 2 * inertia[ks[i]] + inertia[ks[i + 1]]
        if bend > best_bend:
            best_bend = bend
            bestK = ks[i]
    print('The best K according to the elbow method is ', bestK)
    return bestK
Bag-of-Words (BoW) vectorization is a basic technique for converting text into a numerical vector.
# BOW vectorization: convert each review into a sparse term-count vector.
bowModel1 = CountVectorizer()
XdataBOWV1 = bowModel1.fit_transform(Xdata1)
XdataBOWV1.shape
# Standardizing vectors (with_mean=False keeps the sparse matrix sparse).
XdataBOWV1 = StandardScaler(with_mean=False).fit_transform(XdataBOWV1)
k = optimalKmeans(XdataBOWV1)
BOWkm = KMeans(n_clusters=k, init='k-means++', precompute_distances=True, n_jobs=-1)
cluster = list(BOWkm.fit_predict(XdataBOWV1))
# BUG FIX: was hard-coded 8; the header must match the k actually used above.
wordCloud(cluster, list(Xdata1), k)
# Generating vectors out of text using TF-IDF.
tfidfModel1 = TfidfVectorizer()
XdataTFIDFV1 = tfidfModel1.fit_transform(Xdata1)
# Standardizing vectors (with_mean=False keeps the sparse matrix sparse).
XdataTFIDFV1 = StandardScaler(with_mean=False).fit_transform(XdataTFIDFV1)
k = optimalKmeans(XdataTFIDFV1)
TFIDFkm = KMeans(n_clusters=k, init='k-means++', precompute_distances=True, n_jobs=-1)
cluster = list(TFIDFkm.fit_predict(XdataTFIDFV1))
# BUG FIX: was hard-coded 7; the header must match the k actually used above.
wordCloud(cluster, list(Xdata1), k)
import gensim
# training our gensim model on our train text
import re
import string
def cleanhtml(sentance):
    """Replace every HTML tag (any text enclosed in <...>) with a single space."""
    return re.sub('<.*?>', ' ', sentance)
#function for removing punctuations chars
def cleanpunc(sentance):
    """Strip ? ! ' " # and . , ) ( \\ / | characters from *sentance*."""
    cleaned = re.sub(r'[?|!|\'|"|#]', r'', sentance)
    # BUG FIX: the second substitution previously ran on the original
    # `sentance`, silently discarding the first substitution's result.
    cleaned = re.sub(r'[.|,|)|(|\|/]', r'', cleaned)
    return cleaned
# Build the Word2Vec training corpus: one list of lowercase alphabetic
# tokens per review (HTML stripped, punctuation removed).
# (Removed the unused counter variable `i` from the original loop.)
lists = []
for sent in Xdata1.values:
    sent = cleanhtml(sent)
    filtered_sentence = [cleaned.lower()
                         for w in sent.split()
                         for cleaned in cleanpunc(w).split()
                         if cleaned.isalpha()]
    lists.append(filtered_sentence)
# Train Word2Vec on the tokenized reviews: 50-dim vectors, words that
# appear at least 5 times, 4 worker threads.
# NOTE(review): `size=` and `wv.vocab` are gensim 3.x APIs; gensim 4+
# renamed them to `vector_size=` and `wv.key_to_index` — confirm the
# installed gensim version.
w2v_model= gensim.models.Word2Vec(lists,min_count=5,size=50,workers=4)
print(len(list(w2v_model.wv.vocab)))
# Vocabulary list reused by w2vVect / tfidfw2vVect below.
w2v_words = list(w2v_model.wv.vocab)
# converting list of sentance into list of list of words
# then to vector using avg w2v
# function to convert list of list of words to vect using avg w2v
def w2vVect(X):
    '''
    Convert a Series of sentences into average-Word2Vec sentence vectors.

    Each sentence is HTML-stripped, punctuation-cleaned and tokenized to
    lowercase alphabetic words; the sentence vector is the mean of the
    50-d Word2Vec vectors of its in-vocabulary words (zeros if none).
    *************GENSIM MODEL WAS TRAINED ON TRAINDATA***************

    Parameters: X — pandas Series of review strings.
    Returns: list of numpy arrays, one 50-d vector per sentence.
    '''
    token_lists = []
    for sent in X.values:
        sent = cleanhtml(sent)
        tokens = [cleaned.lower()
                  for w in sent.split()
                  for cleaned in cleanpunc(w).split()
                  if cleaned.isalpha()]
        token_lists.append(tokens)
    # PERF FIX: membership tests against the vocabulary LIST were
    # O(|vocab|) per word; a set makes each test O(1).
    vocab = set(w2v_words)
    sent_vectors = []
    for tokens in token_lists:
        sent_vec = np.zeros(50)
        cnt_words = 0
        for word in tokens:
            if word in vocab:
                sent_vec += w2v_model.wv[word]
                cnt_words += 1
        if cnt_words != 0:
            sent_vec /= cnt_words
        sent_vectors.append(sent_vec)
    return sent_vectors
# Vectorizing our data with average Word2Vec.
XdataW2VV1 = w2vVect(Xdata1)
# Standardizing vectors.
XdataW2VV1 = StandardScaler(with_mean=False).fit_transform(XdataW2VV1)
k = optimalKmeans(XdataW2VV1)
W2Vkm = KMeans(n_clusters=k, init='k-means++', precompute_distances=True, n_jobs=-1)
cluster = list(W2Vkm.fit_predict(XdataW2VV1))
# BUG FIX: was hard-coded 8; the header must match the k actually used above.
wordCloud(cluster, list(Xdata1), k)
# TF-IDF model over the same reviews, capped at the 2000 highest-frequency
# terms; its idf values weight the word vectors in tfidfw2vVect below.
tfmodel=TfidfVectorizer(max_features=2000)
tf_idf_matrix = tfmodel.fit_transform(Xdata1.values)
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2
# (use get_feature_names_out()) — confirm the installed version.
tfidf_feat=tfmodel.get_feature_names()
# Map: feature word -> idf value over the whole corpus.
dictionary = {k:v for (k,v) in zip(tfmodel.get_feature_names(), list(tfmodel.idf_))}
def tfidfw2vVect(X):
    '''
    Convert a Series of sentences into TF-IDF-weighted average Word2Vec
    sentence vectors.

    Tokenization matches w2vVect; each in-vocabulary word's vector is
    weighted by idf(word) * tf(word in sentence), and the weighted sum is
    divided by the total weight. Uses the module-level w2v_model /
    w2v_words / dictionary built above.

    Parameters: X — pandas Series of review strings.
    Returns: numpy array of 50-d sentence vectors (NaN/inf replaced by finite values).
    '''
    token_lists = []
    for sent in X.values:
        sent = cleanhtml(sent)
        tokens = [cleaned.lower()
                  for w in sent.split()
                  for cleaned in cleanpunc(w).split()
                  if cleaned.isalpha()]
        token_lists.append(tokens)
    # O(1) membership instead of O(|vocab|) list scans.
    vocab = set(w2v_words)
    tfidfw2v_sent_vectors = []  # tfidf-w2v vector for each sentence/review
    for tokens in token_lists:
        sent_vec = np.zeros(50)
        weight_sum = 0.0
        # PERF FIX: sent.count(word) per occurrence was O(len(sent)^2);
        # Counter computes all term frequencies in one pass.
        counts = Counter(tokens)
        n = len(tokens)
        for word in tokens:
            # BUG FIX: a bare `except: pass` previously hid the KeyError
            # raised for words outside the 2000-feature tfidf dictionary
            # (and any other error); skip those words explicitly instead.
            if word in vocab and word in dictionary:
                vec = w2v_model.wv[word]
                # dictionary[word] = idf over the corpus; counts/n = tf here.
                tf_idf = dictionary[word] * (counts[word] / n)
                sent_vec += vec * tf_idf
                weight_sum += tf_idf
        if weight_sum != 0:
            sent_vec /= weight_sum
        tfidfw2v_sent_vectors.append(sent_vec)
    # Convert any NaN / infinite entries to finite numbers.
    return np.nan_to_num(tfidfw2v_sent_vectors)
# Feeding text data and receiving TF-IDF-weighted Word2Vec vectors.
XdataTFIDFW2VV1 = tfidfw2vVect(Xdata1)
# Standardizing vectors.
XdataTFIDFW2VV1 = StandardScaler(with_mean=False).fit_transform(XdataTFIDFW2VV1)
k = optimalKmeans(XdataTFIDFW2VV1)
TFIDFW2Vkm = KMeans(n_clusters=k, init='k-means++', precompute_distances=True, n_jobs=-1)
cluster = list(TFIDFW2Vkm.fit_predict(XdataTFIDFW2VV1))
# BUG FIX: was hard-coded 8; the header must match the k actually used above.
wordCloud(cluster, list(Xdata1), k)
======================================================================================
from sklearn.cluster import AgglomerativeClustering
# First 4k rows after the earlier ASCENDING sort by Time — i.e. the
# EARLIEST 4k reviews, not the latest as the original comment claimed.
# (Agglomerative clustering is O(n^2) memory, hence the smaller sample.)
Xdata2= data[:4000]['CleanedText']
len(Xdata2)
# Average-Word2Vec vectors for the 4k sample, standardised, then grouped
# into 7 clusters by agglomerative clustering; word cloud per cluster.
XdataW2VV2 = w2vVect(Xdata2)
XdataW2VV2 = StandardScaler(with_mean=False).fit_transform(XdataW2VV2)
W2V2aglo = AgglomerativeClustering(n_clusters=7)
W2V2aglo.fit(XdataW2VV2)
cluster = list(W2V2aglo.labels_)
wordCloud(cluster, list(Xdata2), 7)
# TF-IDF-weighted Word2Vec vectors for the same 4k sample, standardised,
# then 7-cluster agglomerative clustering with a word cloud per cluster.
XdataTFIDFW2VV2 = tfidfw2vVect(Xdata2)
XdataTFIDFW2VV2 = StandardScaler(with_mean=False).fit_transform(XdataTFIDFW2VV2)
TFIDFW2VV2aglo = AgglomerativeClustering(n_clusters=7)
TFIDFW2VV2aglo.fit(XdataTFIDFW2VV2)
cluster = list(TFIDFW2VV2aglo.labels_)
wordCloud(cluster, list(Xdata2), 7)
======================================================================================
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN
def NNforEPS(XdataVec, nn):
    '''
    Elbow plot for choosing DBSCAN's eps: compute each sample's distance
    to its nn-th nearest neighbour, sort those distances, and plot them;
    the bend of the curve suggests a good eps value.
    '''
    neigh = NearestNeighbors(n_neighbors=nn, n_jobs=-1)
    neigh.fit(XdataVec)
    distances, _ = neigh.kneighbors(XdataVec)
    kth_distances = sorted(distances[:, nn - 1])
    sample_ids = list(range(1, len(XdataVec) + 1))
    plt.plot(sample_ids, kth_distances, 'r-')
    plt.xlabel('n_samples')
    plt.ylabel('Distance')
    plt.title('elbow method for finding eps')
    plt.grid()
    plt.show()
# First 4k rows after the earlier ASCENDING sort by Time — i.e. the
# EARLIEST 4k reviews, not the latest as the original comment claimed.
Xdata3= data[:4000]['CleanedText']
len(Xdata3)
# Average-Word2Vec vectors for the 4k sample, standardised; pick eps from
# the k-distance elbow plot, run DBSCAN, and draw a word cloud per label
# (DBSCAN's noise label -1 shows up as its own group).
XdataW2VV3 = w2vVect(Xdata3)
XdataW2VV3 = StandardScaler(with_mean=False).fit_transform(XdataW2VV3)
NNforEPS(XdataW2VV3, 10)
W2Vdbscan = DBSCAN(eps=3.5, min_samples=10, n_jobs=-1)
W2Vdbscan.fit(XdataW2VV3)
cluster = list(W2Vdbscan.labels_)
wordCloud(cluster, list(Xdata3), len(set(cluster)))
# TF-IDF-weighted Word2Vec vectors for the 4k sample, standardised; pick
# eps from the k-distance elbow plot, run DBSCAN, word cloud per label.
XdataTFIDFW2VV3 = tfidfw2vVect(Xdata3)
XdataTFIDFW2VV3 = StandardScaler(with_mean=False).fit_transform(XdataTFIDFW2VV3)
NNforEPS(XdataTFIDFW2VV3, 5)
TFIDFW2Vdbscan = DBSCAN(eps=4, min_samples=5, n_jobs=-1)
TFIDFW2Vdbscan.fit(XdataTFIDFW2VV3)
cluster = list(TFIDFW2Vdbscan.labels_)
wordCloud(cluster, list(Xdata3), len(set(cluster)))
print('end\n\n\n\n\n')